In [14]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
sys.path.insert(0, '../../')
#import Datanalytics as da 
from JKBio import TerraFunction as terra
%load_ext autoreload
%autoreload 2
from JKBio import Helper as h

import pickle
from taigapy import TaigaClient
tc = TaigaClient()
import numpy as np
import itertools

from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
import matplotlib.pyplot as plt

import seaborn as sns
import gseapy
from JKBio.helper import pyDESeq2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering

from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading BokehJS ...
In [7]:
! gsutil mv gs://transfer-amlproject/*MP7624* gs://transfer-amlproject/RNPv2/
Copying gs://transfer-amlproject/20200304_10_MP7624_S10_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_10_MP7624_S10_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_10_MP7624_S10_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_10_MP7624_S10_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_11_MP7624_S11_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_11_MP7624_S11_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_11_MP7624_S11_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_11_MP7624_S11_R2_001.fastq.gz...     

==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://transfer-amlproject/20200304_12_MP7624_S12_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_12_MP7624_S12_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_12_MP7624_S12_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_12_MP7624_S12_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_13_MP7624_S13_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_13_MP7624_S13_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_13_MP7624_S13_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_13_MP7624_S13_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_14_MP7624_S14_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_14_MP7624_S14_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_14_MP7624_S14_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_14_MP7624_S14_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_15_MP7624_S15_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_15_MP7624_S15_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_15_MP7624_S15_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_15_MP7624_S15_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_16_MP7624_S16_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_16_MP7624_S16_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_16_MP7624_S16_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_16_MP7624_S16_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_17_MP7624_S17_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_17_MP7624_S17_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_17_MP7624_S17_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_17_MP7624_S17_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_18_MP7624_S18_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_18_MP7624_S18_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_18_MP7624_S18_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_18_MP7624_S18_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_19_MP7624_S19_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_19_MP7624_S19_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_19_MP7624_S19_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_19_MP7624_S19_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_1_MP7624_S1_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_1_MP7624_S1_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_1_MP7624_S1_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_1_MP7624_S1_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_20_MP7624_S20_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_20_MP7624_S20_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_20_MP7624_S20_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_20_MP7624_S20_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_21_MP7624_S21_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_21_MP7624_S21_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_21_MP7624_S21_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_21_MP7624_S21_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_22_MP7624_S22_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_22_MP7624_S22_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_22_MP7624_S22_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_22_MP7624_S22_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_23_MP7624_S23_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_23_MP7624_S23_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_23_MP7624_S23_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_23_MP7624_S23_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_24_MP7624_S24_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_24_MP7624_S24_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_24_MP7624_S24_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_24_MP7624_S24_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_25_MP7624_S25_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_25_MP7624_S25_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_25_MP7624_S25_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_25_MP7624_S25_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_26_MP7624_S26_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_26_MP7624_S26_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_26_MP7624_S26_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_26_MP7624_S26_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_27_MP7624_S27_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_27_MP7624_S27_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_27_MP7624_S27_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_27_MP7624_S27_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_28_MP7624_S28_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_28_MP7624_S28_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_28_MP7624_S28_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_28_MP7624_S28_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_29_MP7624_S29_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_29_MP7624_S29_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_29_MP7624_S29_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_29_MP7624_S29_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_2_MP7624_S2_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_2_MP7624_S2_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_2_MP7624_S2_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_2_MP7624_S2_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_30_MP7624_S30_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_30_MP7624_S30_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_30_MP7624_S30_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_30_MP7624_S30_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_31_MP7624_S31_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_31_MP7624_S31_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_31_MP7624_S31_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_31_MP7624_S31_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_32_MP7624_S32_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_32_MP7624_S32_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_32_MP7624_S32_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_32_MP7624_S32_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_33_MP7624_S33_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_33_MP7624_S33_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_33_MP7624_S33_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_33_MP7624_S33_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_34_MP7624_S34_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_34_MP7624_S34_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_34_MP7624_S34_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_34_MP7624_S34_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_35_MP7624_S35_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_35_MP7624_S35_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_35_MP7624_S35_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_35_MP7624_S35_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_36_MP7624_S36_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_36_MP7624_S36_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_36_MP7624_S36_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_36_MP7624_S36_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_37_MP7624_S37_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_37_MP7624_S37_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_37_MP7624_S37_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_37_MP7624_S37_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_38_MP7624_S38_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_38_MP7624_S38_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_38_MP7624_S38_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_38_MP7624_S38_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_39_MP7624_S39_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_39_MP7624_S39_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_39_MP7624_S39_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_39_MP7624_S39_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_3_MP7624_S3_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_3_MP7624_S3_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_3_MP7624_S3_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_3_MP7624_S3_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_40_MP7624_S40_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_40_MP7624_S40_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_40_MP7624_S40_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_40_MP7624_S40_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_41_MP7624_S41_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_41_MP7624_S41_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_41_MP7624_S41_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_41_MP7624_S41_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_42_MP7624_S42_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_42_MP7624_S42_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_42_MP7624_S42_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_42_MP7624_S42_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_43_MP7624_S43_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_43_MP7624_S43_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_43_MP7624_S43_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_43_MP7624_S43_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_44_MP7624_S44_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_44_MP7624_S44_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_44_MP7624_S44_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_44_MP7624_S44_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_45_MP7624_S45_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_45_MP7624_S45_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_45_MP7624_S45_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_45_MP7624_S45_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_46_MP7624_S46_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_46_MP7624_S46_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_46_MP7624_S46_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_46_MP7624_S46_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_47_MP7624_S47_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_47_MP7624_S47_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_47_MP7624_S47_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_47_MP7624_S47_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_48_MP7624_S48_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_48_MP7624_S48_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_48_MP7624_S48_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_48_MP7624_S48_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_49_MP7624_S49_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_49_MP7624_S49_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_49_MP7624_S49_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_49_MP7624_S49_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_4_MP7624_S4_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_4_MP7624_S4_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_4_MP7624_S4_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_4_MP7624_S4_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_50_MP7624_S50_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_50_MP7624_S50_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_50_MP7624_S50_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_50_MP7624_S50_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_51_MP7624_S51_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_51_MP7624_S51_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_51_MP7624_S51_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_51_MP7624_S51_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_52_MP7624_S52_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_52_MP7624_S52_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_52_MP7624_S52_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_52_MP7624_S52_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_53_MP7624_S53_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_53_MP7624_S53_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_53_MP7624_S53_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_53_MP7624_S53_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_54_MP7624_S54_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_54_MP7624_S54_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_54_MP7624_S54_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_54_MP7624_S54_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_55_MP7624_S55_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_55_MP7624_S55_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_55_MP7624_S55_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_55_MP7624_S55_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_56_MP7624_S56_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_56_MP7624_S56_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_56_MP7624_S56_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_56_MP7624_S56_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_57_MP7624_S57_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_57_MP7624_S57_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_57_MP7624_S57_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_57_MP7624_S57_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_58_MP7624_S58_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_58_MP7624_S58_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_58_MP7624_S58_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_58_MP7624_S58_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_59_MP7624_S59_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_59_MP7624_S59_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_59_MP7624_S59_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_59_MP7624_S59_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_5_MP7624_S5_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_5_MP7624_S5_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_5_MP7624_S5_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_5_MP7624_S5_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_60_MP7624_S60_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_60_MP7624_S60_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_60_MP7624_S60_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_60_MP7624_S60_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_61_MP7624_S61_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_61_MP7624_S61_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_61_MP7624_S61_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_61_MP7624_S61_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_62_MP7624_S62_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_62_MP7624_S62_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_62_MP7624_S62_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_62_MP7624_S62_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_63_MP7624_S63_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_63_MP7624_S63_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_63_MP7624_S63_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_63_MP7624_S63_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_64_MP7624_S64_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_64_MP7624_S64_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_64_MP7624_S64_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_64_MP7624_S64_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_65_MP7624_S65_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_65_MP7624_S65_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_65_MP7624_S65_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_65_MP7624_S65_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_66_MP7624_S66_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_66_MP7624_S66_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_66_MP7624_S66_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_66_MP7624_S66_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_67_MP7624_S67_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_67_MP7624_S67_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_67_MP7624_S67_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_67_MP7624_S67_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_68_MP7624_S68_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_68_MP7624_S68_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_68_MP7624_S68_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_68_MP7624_S68_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_69_MP7624_S69_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_69_MP7624_S69_R1_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_69_MP7624_S69_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_69_MP7624_S69_R2_001.fastq.gz...     
Copying gs://transfer-amlproject/20200304_6_MP7624_S6_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_6_MP7624_S6_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_6_MP7624_S6_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_6_MP7624_S6_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_7_MP7624_S7_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_7_MP7624_S7_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_7_MP7624_S7_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_7_MP7624_S7_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_8_MP7624_S8_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_8_MP7624_S8_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_8_MP7624_S8_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_8_MP7624_S8_R2_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_9_MP7624_S9_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_9_MP7624_S9_R1_001.fastq.gz...       
Copying gs://transfer-amlproject/20200304_9_MP7624_S9_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Removing gs://transfer-amlproject/20200304_9_MP7624_S9_R2_001.fastq.gz...       

==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.


Operation completed over 138 objects/240.6 GiB.                                  
In [8]:
! gsutil -m cp -r gs://transfer-amlproject/RNPv2/ gs://amlproject/
Copying gs://transfer-amlproject/RNPv2/20200304_10_MP7624_S10_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_10_MP7624_S10_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_11_MP7624_S11_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_11_MP7624_S11_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_12_MP7624_S12_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_12_MP7624_S12_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_13_MP7624_S13_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_14_MP7624_S14_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_13_MP7624_S13_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_14_MP7624_S14_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_15_MP7624_S15_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_15_MP7624_S15_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_16_MP7624_S16_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_16_MP7624_S16_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_17_MP7624_S17_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_17_MP7624_S17_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_18_MP7624_S18_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_18_MP7624_S18_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_19_MP7624_S19_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_19_MP7624_S19_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_1_MP7624_S1_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_1_MP7624_S1_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_20_MP7624_S20_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_20_MP7624_S20_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_21_MP7624_S21_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_21_MP7624_S21_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_22_MP7624_S22_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_22_MP7624_S22_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_23_MP7624_S23_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_23_MP7624_S23_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_24_MP7624_S24_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_24_MP7624_S24_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_25_MP7624_S25_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_25_MP7624_S25_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_26_MP7624_S26_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_26_MP7624_S26_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_27_MP7624_S27_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_27_MP7624_S27_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_28_MP7624_S28_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_29_MP7624_S29_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_28_MP7624_S28_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_2_MP7624_S2_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_29_MP7624_S29_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_2_MP7624_S2_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_30_MP7624_S30_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_30_MP7624_S30_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_31_MP7624_S31_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_32_MP7624_S32_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_31_MP7624_S31_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_33_MP7624_S33_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_32_MP7624_S32_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_33_MP7624_S33_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_34_MP7624_S34_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_34_MP7624_S34_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_35_MP7624_S35_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_48_MP7624_S48_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_35_MP7624_S35_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_36_MP7624_S36_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_36_MP7624_S36_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_37_MP7624_S37_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_38_MP7624_S38_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_37_MP7624_S37_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_3_MP7624_S3_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_38_MP7624_S38_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_39_MP7624_S39_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_39_MP7624_S39_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_40_MP7624_S40_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_3_MP7624_S3_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_40_MP7624_S40_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_42_MP7624_S42_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_41_MP7624_S41_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_47_MP7624_S47_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_44_MP7624_S44_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_41_MP7624_S41_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_42_MP7624_S42_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_43_MP7624_S43_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_45_MP7624_S45_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_43_MP7624_S43_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_44_MP7624_S44_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_45_MP7624_S45_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_46_MP7624_S46_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_46_MP7624_S46_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_49_MP7624_S49_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_47_MP7624_S47_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_49_MP7624_S49_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_4_MP7624_S4_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_51_MP7624_S51_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_4_MP7624_S4_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_48_MP7624_S48_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_51_MP7624_S51_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_50_MP7624_S50_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_50_MP7624_S50_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_52_MP7624_S52_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_53_MP7624_S53_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_52_MP7624_S52_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_53_MP7624_S53_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_54_MP7624_S54_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_54_MP7624_S54_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_55_MP7624_S55_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_55_MP7624_S55_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_56_MP7624_S56_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_56_MP7624_S56_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_57_MP7624_S57_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_57_MP7624_S57_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_58_MP7624_S58_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_58_MP7624_S58_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_59_MP7624_S59_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_59_MP7624_S59_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_5_MP7624_S5_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_5_MP7624_S5_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_60_MP7624_S60_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_60_MP7624_S60_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_61_MP7624_S61_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_61_MP7624_S61_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_62_MP7624_S62_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_62_MP7624_S62_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_63_MP7624_S63_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_63_MP7624_S63_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_64_MP7624_S64_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_64_MP7624_S64_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_65_MP7624_S65_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_65_MP7624_S65_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_66_MP7624_S66_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_66_MP7624_S66_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_67_MP7624_S67_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_68_MP7624_S68_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_68_MP7624_S68_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_67_MP7624_S67_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_69_MP7624_S69_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_6_MP7624_S6_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_69_MP7624_S69_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_6_MP7624_S6_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_7_MP7624_S7_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_7_MP7624_S7_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_8_MP7624_S8_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_8_MP7624_S8_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_9_MP7624_S9_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying gs://transfer-amlproject/RNPv2/20200304_9_MP7624_S9_R2_001.fastq.gz [Content-Type=application/octet-stream]...
\ [138/138 files][240.6 GiB/240.6 GiB] 100% Done                                
Operation completed over 138 objects/240.6 GiB.                                  
In [10]:
! gsutil ls gs://amlproject/
gs://amlproject/MV-4-11.bai
gs://amlproject/MV-4-11.bam
gs://amlproject/Chip/
gs://amlproject/RNA/
gs://amlproject/RNPv2/
In [12]:
sampleset='MAX_AML_RNPv2'
In [ ]:
! gsutil -m cp -r gs://amlproject/RNPv2/ ../../data
In [209]:
! bwa index -a bwtsw ../data/ERCC92/ERCC92.fa
[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[BWTIncCreate] textLength=165512, availableWord=210772
[bwt_gen] Finished constructing BWT in 5 iterations.
[bwa_index] 0.02 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.01 sec
[main] Version: 0.7.5-r404
[main] CMD: bwa index -a bwtsw ../data/ERCC92/ERCC92.fa
[main] Real time: 0.162 sec; CPU: 0.032 sec
In [210]:
! samtools faidx ../data/ERCC92/ERCC92.fa
In [215]:
from JKBio import Helper as h
In [8]:
! ../../TrimGalore-0.6.5/trim_galore
Multicore support not enabled. Proceeding with single-core trimming.
Path to Cutadapt set as: 'cutadapt' (default)
Cutadapt seems to be working fine (tested command 'cutadapt --version')
Cutadapt version: 2.8
single-core operation.
No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)


Please provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!

USAGE:  'trim_galore [options] <filename(s)>'    or    'trim_galore --help'    for more options

In [11]:
ls -alh res
total 138M
drwxr-xr-x 2 jeremie jeremie 4.0K Mar 13 18:48 ./
drwxr-xr-x 5 jeremie jeremie 4.0K Mar 13 18:48 ../
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_10_MP7624_S10_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  15M Mar 13 18:48 20200304_10_MP7624_S10_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_11_MP7624_S11_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_11_MP7624_S11_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_12_MP7624_S12_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  15M Mar 13 18:48 20200304_12_MP7624_S12_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_13_MP7624_S13_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_13_MP7624_S13_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_14_MP7624_S14_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_14_MP7624_S14_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  763 Mar 13 18:48 20200304_15_MP7624_S15_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_15_MP7624_S15_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_16_MP7624_S16_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_16_MP7624_S16_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_17_MP7624_S17_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_17_MP7624_S17_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_18_MP7624_S18_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  14M Mar 13 18:48 20200304_18_MP7624_S18_R1_001_trimmed.fq.gz
-rw-r--r-- 1 jeremie jeremie  764 Mar 13 18:48 20200304_19_MP7624_S19_R1_001.fastq.gz_trimming_report.txt
-rw-r--r-- 1 jeremie jeremie  15M Mar 13 18:48 20200304_19_MP7624_S19_R1_001_trimmed.fq.gz
In [ ]:
h.getSpikeInControlScales('../data/ERCC92/ERCC92.fa', fastQfolder='res/', mapper='bwa', pairedEnd=True, cores=10, pathtosam='samtools', pathtotrim_galore='../../TrimGalore-0.6.5/trim_galore', pathtobwa='bwa',totrim=False, tomap=True, tofilter=True, results='res/', toremove=True)
if paired_end, need to be name_*1, name_*2
using all files from folder
[('20200304_10_MP7624_S10_R1_001_val_1.fq.gz', '20200304_10_MP7624_S10_R2_001_val_2.fq.gz'), ('20200304_11_MP7624_S11_R1_001_val_1.fq.gz', '20200304_11_MP7624_S11_R2_001_val_2.fq.gz'), ('20200304_12_MP7624_S12_R1_001_val_1.fq.gz', '20200304_12_MP7624_S12_R2_001_val_2.fq.gz'), ('20200304_13_MP7624_S13_R1_001_val_1.fq.gz', '20200304_13_MP7624_S13_R2_001_val_2.fq.gz'), ('20200304_14_MP7624_S14_R1_001_val_1.fq.gz', '20200304_14_MP7624_S14_R2_001_val_2.fq.gz'), ('20200304_15_MP7624_S15_R1_001_val_1.fq.gz', '20200304_15_MP7624_S15_R2_001_val_2.fq.gz'), ('20200304_16_MP7624_S16_R1_001_val_1.fq.gz', '20200304_16_MP7624_S16_R2_001_val_2.fq.gz'), ('20200304_17_MP7624_S17_R1_001_val_1.fq.gz', '20200304_17_MP7624_S17_R2_001_val_2.fq.gz'), ('20200304_18_MP7624_S18_R1_001_val_1.fq.gz', '20200304_18_MP7624_S18_R2_001_val_2.fq.gz'), ('20200304_19_MP7624_S19_R1_001_val_1.fq.gz', '20200304_19_MP7624_S19_R2_001_val_2.fq.gz'), ('20200304_1_MP7624_S1_R1_001_val_1.fq.gz', '20200304_1_MP7624_S1_R2_001_val_2.fq.gz'), ('20200304_20_MP7624_S20_R1_001_val_1.fq.gz', '20200304_20_MP7624_S20_R2_001_val_2.fq.gz'), ('20200304_21_MP7624_S21_R1_001_val_1.fq.gz', '20200304_21_MP7624_S21_R2_001_val_2.fq.gz'), ('20200304_22_MP7624_S22_R1_001_val_1.fq.gz', '20200304_22_MP7624_S22_R2_001_val_2.fq.gz'), ('20200304_23_MP7624_S23_R1_001_val_1.fq.gz', '20200304_23_MP7624_S23_R2_001_val_2.fq.gz'), ('20200304_24_MP7624_S24_R1_001_val_1.fq.gz', '20200304_24_MP7624_S24_R2_001_val_2.fq.gz'), ('20200304_25_MP7624_S25_R1_001_val_1.fq.gz', '20200304_25_MP7624_S25_R2_001_val_2.fq.gz'), ('20200304_26_MP7624_S26_R1_001_val_1.fq.gz', '20200304_26_MP7624_S26_R2_001_val_2.fq.gz'), ('20200304_27_MP7624_S27_R1_001_val_1.fq.gz', '20200304_27_MP7624_S27_R2_001_val_2.fq.gz'), ('20200304_28_MP7624_S28_R1_001_val_1.fq.gz', '20200304_28_MP7624_S28_R2_001_val_2.fq.gz'), ('20200304_29_MP7624_S29_R1_001_val_1.fq.gz', '20200304_29_MP7624_S29_R2_001_val_2.fq.gz'), ('20200304_2_MP7624_S2_R1_001_val_1.fq.gz', '20200304_2_MP7624_S2_R2_001_val_2.fq.gz'), ('20200304_30_MP7624_S30_R1_001_val_1.fq.gz', '20200304_30_MP7624_S30_R2_001_val_2.fq.gz'), ('20200304_31_MP7624_S31_R1_001_val_1.fq.gz', '20200304_31_MP7624_S31_R2_001_val_2.fq.gz'), ('20200304_32_MP7624_S32_R1_001_val_1.fq.gz', '20200304_32_MP7624_S32_R2_001_val_2.fq.gz'), ('20200304_33_MP7624_S33_R1_001_val_1.fq.gz', '20200304_33_MP7624_S33_R2_001_val_2.fq.gz'), ('20200304_34_MP7624_S34_R1_001_val_1.fq.gz', '20200304_34_MP7624_S34_R2_001_val_2.fq.gz'), ('20200304_35_MP7624_S35_R1_001_val_1.fq.gz', '20200304_35_MP7624_S35_R2_001_val_2.fq.gz'), ('20200304_36_MP7624_S36_R1_001_val_1.fq.gz', '20200304_36_MP7624_S36_R2_001_val_2.fq.gz'), ('20200304_37_MP7624_S37_R1_001_val_1.fq.gz', '20200304_37_MP7624_S37_R2_001_val_2.fq.gz'), ('20200304_38_MP7624_S38_R1_001_val_1.fq.gz', '20200304_38_MP7624_S38_R2_001_val_2.fq.gz'), ('20200304_39_MP7624_S39_R1_001_val_1.fq.gz', '20200304_39_MP7624_S39_R2_001_val_2.fq.gz'), ('20200304_3_MP7624_S3_R1_001_val_1.fq.gz', '20200304_3_MP7624_S3_R2_001_val_2.fq.gz'), ('20200304_40_MP7624_S40_R1_001_val_1.fq.gz', '20200304_40_MP7624_S40_R2_001_val_2.fq.gz'), ('20200304_41_MP7624_S41_R1_001_val_1.fq.gz', '20200304_41_MP7624_S41_R2_001_val_2.fq.gz'), ('20200304_42_MP7624_S42_R1_001_val_1.fq.gz', '20200304_42_MP7624_S42_R2_001_val_2.fq.gz'), ('20200304_43_MP7624_S43_R1_001_val_1.fq.gz', '20200304_43_MP7624_S43_R2_001_val_2.fq.gz'), ('20200304_44_MP7624_S44_R1_001_val_1.fq.gz', '20200304_44_MP7624_S44_R2_001_val_2.fq.gz'), ('20200304_45_MP7624_S45_R1_001_val_1.fq.gz', '20200304_45_MP7624_S45_R2_001_val_2.fq.gz'), ('20200304_46_MP7624_S46_R1_001_val_1.fq.gz', '20200304_46_MP7624_S46_R2_001_val_2.fq.gz'), ('20200304_47_MP7624_S47_R1_001_val_1.fq.gz', '20200304_47_MP7624_S47_R2_001_val_2.fq.gz'), ('20200304_48_MP7624_S48_R1_001_val_1.fq.gz', '20200304_48_MP7624_S48_R2_001_val_2.fq.gz'), ('20200304_49_MP7624_S49_R1_001_val_1.fq.gz', '20200304_49_MP7624_S49_R2_001_val_2.fq.gz'), ('20200304_4_MP7624_S4_R1_001_val_1.fq.gz', '20200304_4_MP7624_S4_R2_001_val_2.fq.gz'), ('20200304_50_MP7624_S50_R1_001_val_1.fq.gz', '20200304_50_MP7624_S50_R2_001_val_2.fq.gz'), ('20200304_51_MP7624_S51_R1_001_val_1.fq.gz', '20200304_51_MP7624_S51_R2_001_val_2.fq.gz'), ('20200304_52_MP7624_S52_R1_001_val_1.fq.gz', '20200304_52_MP7624_S52_R2_001_val_2.fq.gz'), ('20200304_53_MP7624_S53_R1_001_val_1.fq.gz', '20200304_53_MP7624_S53_R2_001_val_2.fq.gz'), ('20200304_54_MP7624_S54_R1_001_val_1.fq.gz', '20200304_54_MP7624_S54_R2_001_val_2.fq.gz'), ('20200304_55_MP7624_S55_R1_001_val_1.fq.gz', '20200304_55_MP7624_S55_R2_001_val_2.fq.gz'), ('20200304_56_MP7624_S56_R1_001_val_1.fq.gz', '20200304_56_MP7624_S56_R2_001_val_2.fq.gz'), ('20200304_57_MP7624_S57_R1_001_val_1.fq.gz', '20200304_57_MP7624_S57_R2_001_val_2.fq.gz'), ('20200304_58_MP7624_S58_R1_001_val_1.fq.gz', '20200304_58_MP7624_S58_R2_001_val_2.fq.gz'), ('20200304_59_MP7624_S59_R1_001_val_1.fq.gz', '20200304_59_MP7624_S59_R2_001_val_2.fq.gz'), ('20200304_5_MP7624_S5_R1_001_val_1.fq.gz', '20200304_5_MP7624_S5_R2_001_val_2.fq.gz'), ('20200304_60_MP7624_S60_R1_001_val_1.fq.gz', '20200304_60_MP7624_S60_R2_001_val_2.fq.gz'), ('20200304_61_MP7624_S61_R1_001_val_1.fq.gz', '20200304_61_MP7624_S61_R2_001_val_2.fq.gz'), ('20200304_62_MP7624_S62_R1_001_val_1.fq.gz', '20200304_62_MP7624_S62_R2_001_val_2.fq.gz'), ('20200304_63_MP7624_S63_R1_001_val_1.fq.gz', '20200304_63_MP7624_S63_R2_001_val_2.fq.gz'), ('20200304_64_MP7624_S64_R1_001_val_1.fq.gz', '20200304_64_MP7624_S64_R2_001_val_2.fq.gz'), ('20200304_65_MP7624_S65_R1_001_val_1.fq.gz', '20200304_65_MP7624_S65_R2_001_val_2.fq.gz'), ('20200304_66_MP7624_S66_R1_001_val_1.fq.gz', '20200304_66_MP7624_S66_R2_001_val_2.fq.gz'), ('20200304_67_MP7624_S67_R1_001_val_1.fq.gz', '20200304_67_MP7624_S67_R2_001_val_2.fq.gz'), ('20200304_68_MP7624_S68_R1_001_val_1.fq.gz', '20200304_68_MP7624_S68_R2_001_val_2.fq.gz'), ('20200304_69_MP7624_S69_R1_001_val_1.fq.gz', '20200304_69_MP7624_S69_R2_001_val_2.fq.gz'), ('20200304_6_MP7624_S6_R1_001_val_1.fq.gz', '20200304_6_MP7624_S6_R2_001_val_2.fq.gz'), ('20200304_7_MP7624_S7_R1_001_val_1.fq.gz', '20200304_7_MP7624_S7_R2_001_val_2.fq.gz'), ('20200304_8_MP7624_S8_R1_001_val_1.fq.gz', '20200304_8_MP7624_S8_R2_001_val_2.fq.gz'), ('20200304_9_MP7624_S9_R1_001_val_1.fq.gz', '20200304_9_MP7624_S9_R2_001_val_2.fq.gz')]
you need to have your files in the res/ folder


mapping


In [16]:
terra.uploadFromFolder('amlproject','RNPv2/',
                       'broad-firecloud-ccle/hg38_RNAseq',samplesetname=sampleset,
                      fformat="fastqR1R2", sep='_MP7624')
please be sure you gave access to your terra email account access to this bucket
['RNPv2/20200304_10_MP7624_S10_R1_001.fastq.gz', 'RNPv2/20200304_10_MP7624_S10_R2_001.fastq.gz', 'RNPv2/20200304_11_MP7624_S11_R1_001.fastq.gz', 'RNPv2/20200304_11_MP7624_S11_R2_001.fastq.gz', 'RNPv2/20200304_12_MP7624_S12_R1_001.fastq.gz', 'RNPv2/20200304_12_MP7624_S12_R2_001.fastq.gz', 'RNPv2/20200304_13_MP7624_S13_R1_001.fastq.gz', 'RNPv2/20200304_13_MP7624_S13_R2_001.fastq.gz', 'RNPv2/20200304_14_MP7624_S14_R1_001.fastq.gz', 'RNPv2/20200304_14_MP7624_S14_R2_001.fastq.gz', 'RNPv2/20200304_15_MP7624_S15_R1_001.fastq.gz', 'RNPv2/20200304_15_MP7624_S15_R2_001.fastq.gz', 'RNPv2/20200304_16_MP7624_S16_R1_001.fastq.gz', 'RNPv2/20200304_16_MP7624_S16_R2_001.fastq.gz', 'RNPv2/20200304_17_MP7624_S17_R1_001.fastq.gz', 'RNPv2/20200304_17_MP7624_S17_R2_001.fastq.gz', 'RNPv2/20200304_18_MP7624_S18_R1_001.fastq.gz', 'RNPv2/20200304_18_MP7624_S18_R2_001.fastq.gz', 'RNPv2/20200304_19_MP7624_S19_R1_001.fastq.gz', 'RNPv2/20200304_19_MP7624_S19_R2_001.fastq.gz', 'RNPv2/20200304_1_MP7624_S1_R1_001.fastq.gz', 'RNPv2/20200304_1_MP7624_S1_R2_001.fastq.gz', 'RNPv2/20200304_20_MP7624_S20_R1_001.fastq.gz', 'RNPv2/20200304_20_MP7624_S20_R2_001.fastq.gz', 'RNPv2/20200304_21_MP7624_S21_R1_001.fastq.gz', 'RNPv2/20200304_21_MP7624_S21_R2_001.fastq.gz', 'RNPv2/20200304_22_MP7624_S22_R1_001.fastq.gz', 'RNPv2/20200304_22_MP7624_S22_R2_001.fastq.gz', 'RNPv2/20200304_23_MP7624_S23_R1_001.fastq.gz', 'RNPv2/20200304_23_MP7624_S23_R2_001.fastq.gz', 'RNPv2/20200304_24_MP7624_S24_R1_001.fastq.gz', 'RNPv2/20200304_24_MP7624_S24_R2_001.fastq.gz', 'RNPv2/20200304_25_MP7624_S25_R1_001.fastq.gz', 'RNPv2/20200304_25_MP7624_S25_R2_001.fastq.gz', 'RNPv2/20200304_26_MP7624_S26_R1_001.fastq.gz', 'RNPv2/20200304_26_MP7624_S26_R2_001.fastq.gz', 'RNPv2/20200304_27_MP7624_S27_R1_001.fastq.gz', 'RNPv2/20200304_27_MP7624_S27_R2_001.fastq.gz', 'RNPv2/20200304_28_MP7624_S28_R1_001.fastq.gz', 'RNPv2/20200304_28_MP7624_S28_R2_001.fastq.gz', 'RNPv2/20200304_29_MP7624_S29_R1_001.fastq.gz', 'RNPv2/20200304_29_MP7624_S29_R2_001.fastq.gz', 'RNPv2/20200304_2_MP7624_S2_R1_001.fastq.gz', 'RNPv2/20200304_2_MP7624_S2_R2_001.fastq.gz', 'RNPv2/20200304_30_MP7624_S30_R1_001.fastq.gz', 'RNPv2/20200304_30_MP7624_S30_R2_001.fastq.gz', 'RNPv2/20200304_31_MP7624_S31_R1_001.fastq.gz', 'RNPv2/20200304_31_MP7624_S31_R2_001.fastq.gz', 'RNPv2/20200304_32_MP7624_S32_R1_001.fastq.gz', 'RNPv2/20200304_32_MP7624_S32_R2_001.fastq.gz', 'RNPv2/20200304_33_MP7624_S33_R1_001.fastq.gz', 'RNPv2/20200304_33_MP7624_S33_R2_001.fastq.gz', 'RNPv2/20200304_34_MP7624_S34_R1_001.fastq.gz', 'RNPv2/20200304_34_MP7624_S34_R2_001.fastq.gz', 'RNPv2/20200304_35_MP7624_S35_R1_001.fastq.gz', 'RNPv2/20200304_35_MP7624_S35_R2_001.fastq.gz', 'RNPv2/20200304_36_MP7624_S36_R1_001.fastq.gz', 'RNPv2/20200304_36_MP7624_S36_R2_001.fastq.gz', 'RNPv2/20200304_37_MP7624_S37_R1_001.fastq.gz', 'RNPv2/20200304_37_MP7624_S37_R2_001.fastq.gz', 'RNPv2/20200304_38_MP7624_S38_R1_001.fastq.gz', 'RNPv2/20200304_38_MP7624_S38_R2_001.fastq.gz', 'RNPv2/20200304_39_MP7624_S39_R1_001.fastq.gz', 'RNPv2/20200304_39_MP7624_S39_R2_001.fastq.gz', 'RNPv2/20200304_3_MP7624_S3_R1_001.fastq.gz', 'RNPv2/20200304_3_MP7624_S3_R2_001.fastq.gz', 'RNPv2/20200304_40_MP7624_S40_R1_001.fastq.gz', 'RNPv2/20200304_40_MP7624_S40_R2_001.fastq.gz', 'RNPv2/20200304_41_MP7624_S41_R1_001.fastq.gz', 'RNPv2/20200304_41_MP7624_S41_R2_001.fastq.gz', 'RNPv2/20200304_42_MP7624_S42_R1_001.fastq.gz', 'RNPv2/20200304_42_MP7624_S42_R2_001.fastq.gz', 'RNPv2/20200304_43_MP7624_S43_R1_001.fastq.gz', 'RNPv2/20200304_43_MP7624_S43_R2_001.fastq.gz', 'RNPv2/20200304_44_MP7624_S44_R1_001.fastq.gz', 'RNPv2/20200304_44_MP7624_S44_R2_001.fastq.gz', 'RNPv2/20200304_45_MP7624_S45_R1_001.fastq.gz', 'RNPv2/20200304_45_MP7624_S45_R2_001.fastq.gz', 'RNPv2/20200304_46_MP7624_S46_R1_001.fastq.gz', 'RNPv2/20200304_46_MP7624_S46_R2_001.fastq.gz', 'RNPv2/20200304_47_MP7624_S47_R1_001.fastq.gz', 'RNPv2/20200304_47_MP7624_S47_R2_001.fastq.gz', 'RNPv2/20200304_48_MP7624_S48_R1_001.fastq.gz', 'RNPv2/20200304_48_MP7624_S48_R2_001.fastq.gz', 'RNPv2/20200304_49_MP7624_S49_R1_001.fastq.gz', 'RNPv2/20200304_49_MP7624_S49_R2_001.fastq.gz', 'RNPv2/20200304_4_MP7624_S4_R1_001.fastq.gz', 'RNPv2/20200304_4_MP7624_S4_R2_001.fastq.gz', 'RNPv2/20200304_50_MP7624_S50_R1_001.fastq.gz', 'RNPv2/20200304_50_MP7624_S50_R2_001.fastq.gz', 'RNPv2/20200304_51_MP7624_S51_R1_001.fastq.gz', 'RNPv2/20200304_51_MP7624_S51_R2_001.fastq.gz', 'RNPv2/20200304_52_MP7624_S52_R1_001.fastq.gz', 'RNPv2/20200304_52_MP7624_S52_R2_001.fastq.gz', 'RNPv2/20200304_53_MP7624_S53_R1_001.fastq.gz', 'RNPv2/20200304_53_MP7624_S53_R2_001.fastq.gz', 'RNPv2/20200304_54_MP7624_S54_R1_001.fastq.gz', 'RNPv2/20200304_54_MP7624_S54_R2_001.fastq.gz', 'RNPv2/20200304_55_MP7624_S55_R1_001.fastq.gz', 'RNPv2/20200304_55_MP7624_S55_R2_001.fastq.gz', 'RNPv2/20200304_56_MP7624_S56_R1_001.fastq.gz', 'RNPv2/20200304_56_MP7624_S56_R2_001.fastq.gz', 'RNPv2/20200304_57_MP7624_S57_R1_001.fastq.gz', 'RNPv2/20200304_57_MP7624_S57_R2_001.fastq.gz', 'RNPv2/20200304_58_MP7624_S58_R1_001.fastq.gz', 'RNPv2/20200304_58_MP7624_S58_R2_001.fastq.gz', 'RNPv2/20200304_59_MP7624_S59_R1_001.fastq.gz', 'RNPv2/20200304_59_MP7624_S59_R2_001.fastq.gz', 'RNPv2/20200304_5_MP7624_S5_R1_001.fastq.gz', 'RNPv2/20200304_5_MP7624_S5_R2_001.fastq.gz', 'RNPv2/20200304_60_MP7624_S60_R1_001.fastq.gz', 'RNPv2/20200304_60_MP7624_S60_R2_001.fastq.gz', 'RNPv2/20200304_61_MP7624_S61_R1_001.fastq.gz', 'RNPv2/20200304_61_MP7624_S61_R2_001.fastq.gz', 'RNPv2/20200304_62_MP7624_S62_R1_001.fastq.gz', 'RNPv2/20200304_62_MP7624_S62_R2_001.fastq.gz', 'RNPv2/20200304_63_MP7624_S63_R1_001.fastq.gz', 'RNPv2/20200304_63_MP7624_S63_R2_001.fastq.gz', 'RNPv2/20200304_64_MP7624_S64_R1_001.fastq.gz', 'RNPv2/20200304_64_MP7624_S64_R2_001.fastq.gz', 'RNPv2/20200304_65_MP7624_S65_R1_001.fastq.gz', 'RNPv2/20200304_65_MP7624_S65_R2_001.fastq.gz', 'RNPv2/20200304_66_MP7624_S66_R1_001.fastq.gz', 'RNPv2/20200304_66_MP7624_S66_R2_001.fastq.gz', 'RNPv2/20200304_67_MP7624_S67_R1_001.fastq.gz', 'RNPv2/20200304_67_MP7624_S67_R2_001.fastq.gz', 'RNPv2/20200304_68_MP7624_S68_R1_001.fastq.gz', 'RNPv2/20200304_68_MP7624_S68_R2_001.fastq.gz', 'RNPv2/20200304_69_MP7624_S69_R1_001.fastq.gz', 'RNPv2/20200304_69_MP7624_S69_R2_001.fastq.gz', 'RNPv2/20200304_6_MP7624_S6_R1_001.fastq.gz', 'RNPv2/20200304_6_MP7624_S6_R2_001.fastq.gz', 'RNPv2/20200304_7_MP7624_S7_R1_001.fastq.gz', 'RNPv2/20200304_7_MP7624_S7_R2_001.fastq.gz', 'RNPv2/20200304_8_MP7624_S8_R1_001.fastq.gz', 'RNPv2/20200304_8_MP7624_S8_R2_001.fastq.gz', 'RNPv2/20200304_9_MP7624_S9_R1_001.fastq.gz', 'RNPv2/20200304_9_MP7624_S9_R2_001.fastq.gz']
> /home/jeremie/JKBio/TerraFunction.py(227)uploadFromFolder()
    226     ipdb.set_trace()
--> 227     df = pd.DataFrame(data)
    228     print(df)

ipdb> c
      sample_id                                             fastq1  \
0   20200304_10  gs://amlproject/RNPv2/20200304_10_MP7624_S10_R...   
1   20200304_11  gs://amlproject/RNPv2/20200304_11_MP7624_S11_R...   
2   20200304_12  gs://amlproject/RNPv2/20200304_12_MP7624_S12_R...   
3   20200304_13  gs://amlproject/RNPv2/20200304_13_MP7624_S13_R...   
4   20200304_14  gs://amlproject/RNPv2/20200304_14_MP7624_S14_R...   
..          ...                                                ...   
64  20200304_69  gs://amlproject/RNPv2/20200304_69_MP7624_S69_R...   
65   20200304_6  gs://amlproject/RNPv2/20200304_6_MP7624_S6_R1_...   
66   20200304_7  gs://amlproject/RNPv2/20200304_7_MP7624_S7_R1_...   
67   20200304_8  gs://amlproject/RNPv2/20200304_8_MP7624_S8_R1_...   
68   20200304_9  gs://amlproject/RNPv2/20200304_9_MP7624_S9_R1_...   

                                               fastq2  
0   gs://amlproject/RNPv2/20200304_10_MP7624_S10_R...  
1   gs://amlproject/RNPv2/20200304_11_MP7624_S11_R...  
2   gs://amlproject/RNPv2/20200304_12_MP7624_S12_R...  
3   gs://amlproject/RNPv2/20200304_13_MP7624_S13_R...  
4   gs://amlproject/RNPv2/20200304_14_MP7624_S14_R...  
..                                                ...  
64  gs://amlproject/RNPv2/20200304_69_MP7624_S69_R...  
65  gs://amlproject/RNPv2/20200304_6_MP7624_S6_R2_...  
66  gs://amlproject/RNPv2/20200304_7_MP7624_S7_R2_...  
67  gs://amlproject/RNPv2/20200304_8_MP7624_S8_R2_...  
68  gs://amlproject/RNPv2/20200304_9_MP7624_S9_R2_...  

[69 rows x 3 columns]
Successfully imported 69 participants.
Successfully imported 69 samples.
Successfully imported 1 sample sets:
  * MAX_AML_RNPv2 (69 samples)
In [17]:
wm = dm.WorkspaceManager('broad-firecloud-ccle/hg38_RNAseq')
In [18]:
star = wm.get_config("star_v1-0_BETA_cfg")
star
Out[18]:
{'deleted': False,
 'inputs': {'star_workflow.star.outSAMattrRGline': '"ID:rg1 SM:sm1"',
  'star_workflow.star.num_threads': '8',
  'star_workflow.star.alignSoftClipAtReferenceEnds': '"Yes"',
  'star_workflow.star.alignMatesGapMax': '1000000',
  'star_workflow.star.outFilterType': '"BySJout"',
  'star_workflow.star.prefix': 'this.name',
  'star_workflow.star.num_preempt': '5',
  'star_workflow.star.alignSJoverhangMin': '8',
  'star_workflow.star.chimOutJunctionFormat': '1',
  'star_workflow.star.memory': '52',
  'star_workflow.star.fastq2': 'this.fastq2',
  'star_workflow.star.outFilterMismatchNoverLmax': '0.1',
  'star_workflow.star.alignIntronMax': '1000000',
  'star_workflow.star.star_index': 'workspace.star_index',
  'star_workflow.star.outSAMattributes': '"NH HI AS nM NM ch"',
  'star_workflow.star.outSAMstrandField': '"intronMotif"',
  'star_workflow.star.limitSjdbInsertNsj': '1200000',
  'star_workflow.star.chimSegmentMin': '15',
  'star_workflow.star.chimJunctionOverhangMin': '15',
  'star_workflow.star.fastq1': 'this.fastq1',
  'star_workflow.star.chimOutType': '"Junctions WithinBAM SoftClip"',
  'star_workflow.star.quantMode': '"TranscriptomeSAM GeneCounts"',
  'star_workflow.star.disk_space': '250',
  'star_workflow.star.outFilterScoreMinOverLread': '0.33',
  'star_workflow.star.chimMainSegmentMultNmax': '1',
  'star_workflow.star.alignIntronMin': '20',
  'star_workflow.star.outFilterMismatchNmax': '999',
  'star_workflow.star.outFilterMultimapNmax': '20',
  'star_workflow.star.outFilterMatchNminOverLread': '0.33',
  'star_workflow.star.alignSJDBoverhangMin': '1',
  'star_workflow.star.outFilterIntronMotifs': '"None"'},
 'methodConfigVersion': 1,
 'methodRepoMethod': {'methodName': 'star_v1-0_BETA',
  'methodVersion': 7,
  'methodNamespace': 'broadinstitute_gtex',
  'methodUri': 'agora://broadinstitute_gtex/star_v1-0_BETA/7',
  'sourceRepo': 'agora'},
 'name': 'star_v1-0_BETA_cfg',
 'namespace': 'broadinstitute_gtex',
 'outputs': {'star_workflow.star.chimeric_bam_file': 'this.star_chimeric_bam_file',
  'star_workflow.star.junctions': 'this.star_junctions',
  'star_workflow.star.transcriptome_bam': 'this.star_transcriptome_bam',
  'star_workflow.star.chimeric_junctions': 'this.star_chimeric_junctions',
  'star_workflow.star.junctions_pass1': 'this.star_junctions_pass1',
  'star_workflow.star.bam_file': 'this.star_bam_file',
  'star_workflow.star.read_counts': 'this.star_read_counts',
  'star_workflow.star.chimeric_bam_index': 'this.star_chimeric_bam_index',
  'star_workflow.star.bam_index': 'this.star_bam_index',
  'star_workflow.star.logs': 'this.star_logs'},
 'prerequisites': {},
 'rootEntityType': 'sample'}
In [19]:
submission_id = wm.create_submission(star['name'], sampleset, 'sample_set',expression='this.samples')
Successfully created submission 2ad41571-b46e-4c3b-be51-44e800717d2a.
In [20]:
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
1.0 of jobs Succeeded in submission 0.sion 0. 181 mn elapsed..
Out[20]:
[]
In [24]:
submission_id = wm.create_submission("rsem_v1-0_BETA_cfg", 
                                      sampleset,'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
Successfully created submission bc143fed-eeb3-4e32-b277-fbceb736b3a3.
1.0 of jobs Succeeded in submission 0.sion 0. 542 mn elapsed..
Out[24]:
[]
In [26]:
submission_id = wm.create_submission("rsem_aggregate_results_v1-0_BETA_cfg", 
                                         sampleset)
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
Successfully created submission 23e0a589-f993-4434-9f86-c56f2195fbcd.
1.0 of jobs Succeeded in submission 0.sion 0. 20 mn elapsed.
Out[26]:
[]
In [27]:
results = wm.get_sample_sets().loc[sampleset]
rsem_genes_expected_count = results['rsem_genes_expected_count']
In [28]:
results
Out[28]:
samples                            [20200304_10, 20200304_11, 20200304_12, 202003...
rsem_transcripts_isopct            gs://fc-secure-163bcce1-14a1-4cc2-b8f8-ec8bcba...
rsem_transcripts_tpm               gs://fc-secure-163bcce1-14a1-4cc2-b8f8-ec8bcba...
rsem_transcripts_expected_count    gs://fc-secure-163bcce1-14a1-4cc2-b8f8-ec8bcba...
rsem_genes_tpm                     gs://fc-secure-163bcce1-14a1-4cc2-b8f8-ec8bcba...
rsem_genes_expected_count          gs://fc-secure-163bcce1-14a1-4cc2-b8f8-ec8bcba...
Name: MAX_AML_RNPv2, dtype: object
In [29]:
mkdir ../../data/RNPv2
In [30]:
! gsutil cp $rsem_genes_expected_count ../../data/RNPv2/
Copying gs://fc-secure-163bcce1-14a1-4cc2-b8f8-ec8bcbabe2da/23e0a589-f993-4434-9f86-c56f2195fbcd/rsem_aggregate_results_workflow/a83cec92-1d87-41ad-8db3-a8a93b48baf5/call-rsem_aggregate_results/MAX_AML_RNPv2.rsem_genes_expected_count.txt.gz...
/ [1 files][  4.4 MiB/  4.4 MiB]                                                
Operation completed over 1 objects/4.4 MiB.                                      
In [37]:
file = '../../data/RNPv2/'+rsem_genes_expected_count.split('/')[-1]
In [38]:
file
Out[38]:
'../../data/RNPv2/MAX_AML_RNPv2.rsem_genes_expected_count.txt.gz'
In [39]:
! gunzip $file
In [40]:
rsem_genes_expected_count = pd.read_csv(file[:-3], sep='\t')
In [41]:
data = rsem_genes_expected_count.drop("transcript_id(s)",1)
In [43]:
data["gene_id"] = h.convertGenes(data['gene_id'])[0]
you need access to taiga for this (https://pypi.org/project/taigapy/)
20702 could not be parsed... we don't have all genes already
In [44]:
data=data.set_index('gene_id')
In [46]:
data
Out[46]:
20200304_1 20200304_10 20200304_11 20200304_12 20200304_13 20200304_14 20200304_15 20200304_16 20200304_17 20200304_18 ... 20200304_63 20200304_64 20200304_65 20200304_66 20200304_67 20200304_68 20200304_69 20200304_7 20200304_8 20200304_9
gene_id
TSPAN6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.0 0.0 0.0 0.00 0.00 0.00 0.00 0.00 0.00
TNMD 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.0 0.0 0.0 0.00 0.00 0.00 0.00 0.00 0.00
DPM1 1619.00 2465.00 1701.00 1535.00 1863.00 2093.00 2027.00 2202.00 2148.00 2235.00 ... 3272.00 3686.0 3990.0 4714.0 1620.00 1840.00 1729.00 1983.00 2451.00 2378.00
SCYL3 464.57 846.12 672.69 603.75 577.41 617.97 601.43 545.49 575.14 536.97 ... 961.52 1024.2 1155.4 1316.6 430.78 460.04 437.36 542.42 670.02 576.38
C1orf112 780.43 1031.90 755.31 676.25 1232.70 1209.00 1309.60 1370.50 1245.90 1257.10 ... 1647.50 2260.8 2422.6 2757.4 949.22 1277.00 1032.60 1163.60 1481.00 1332.90
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 3.00 5.00 8.00 2.00 2.00 1.00 2.00 1.00 3.00 3.00 ... 7.00 3.0 3.0 4.0 1.00 1.00 5.00 1.00 2.00 4.00
ERCC-00165 215.00 594.00 424.00 509.00 136.00 88.00 165.00 258.00 161.00 163.00 ... 693.00 318.0 221.0 314.0 93.00 139.00 87.00 127.00 187.00 176.00
ERCC-00168 3.00 12.00 9.00 8.00 0.00 8.00 0.00 5.00 5.00 1.00 ... 10.00 6.0 5.0 8.0 3.00 4.00 1.00 3.00 8.00 3.00
ERCC-00170 66.00 205.00 133.00 211.00 57.00 40.00 73.00 94.00 42.00 40.00 ... 158.00 102.0 117.0 132.0 41.00 56.00 33.00 50.00 89.00 88.00
ERCC-00171 13554.00 40900.00 29090.00 33242.00 10039.00 6399.00 10836.00 15684.00 9526.00 8893.00 ... 51554.00 18598.0 15868.0 22396.0 7058.00 7576.00 5882.00 8381.00 10492.00 12389.00

58813 rows × 69 columns

In [59]:
rename = {"1": "mr120-MV411-RNP_IRF2BP2-r4",
"2": "mr121-MV411-RNP_IRF2BP2-r5",
"3": "mr122-MV411-RNP_IRF2BP2-r6",
"4": "mr123-MV411-RNP_IRF8-r4",
"5": "mr124-MV411-RNP_IRF8-r5",
"6": "mr125-MV411-RNP_IRF8-r6",
"7": "mr126-MV411-RNP_MEF2D-r4",
"8": "mr127-MV411-RNP_MEF2D-r5",
"9": "mr128-MV411-RNP_MEF2D-r6",
"10": "mr129-MV411-RNP_MYC-r4",
"11": "mr130-MV411-RNP_MYC-r5",
"12": "mr131-MV411-RNP_MYC-r6",
"13": "mr132-MV411-RNP_RUNX1-r4",
"14": "mr133-MV411-RNP_RUNX1-r5",
"15": "mr134-MV411-RNP_RUNX1-r6",
"16": "mr135-MV411-RNP_RUNX2-r4",
"17": "mr136-MV411-RNP_RUNX2-r5",
"18": "mr137-MV411-RNP_RUNX2-r6",
"19": "mr138-MV411-RNP_SPI1-r4",
"20": "mr139-MV411-RNP_SPI1-r5",
"21": "mr140-MV411-RNP_SPI1-r6",
"22": "mr141-MV411-RNP_ZMYND8-r4",
"23": "mr142-MV411-RNP_ZMYND8-r5",
"24": "mr143-MV411-RNP_ZMYND8-r6",
"25": "mr144-MV411-RNP_LMO2-r4",
"26": "mr145-MV411-RNP_LMO2-r5",
"27": "mr146-MV411-RNP_LMO2-r6",
"28": "mr147-MV411-RNP_LYL1-r4",
"29": "mr148-MV411-RNP_LYL1-r5",
"30": "mr149-MV411-RNP_LYL1-r6",
"31": "mr150-MV411-RNP_MAX-r4",
"32": "mr151-MV411-RNP_MAX-r5",
"33": "mr152-MV411-RNP_MAX-r6",
"34": "mr153-MV411-RNP_ZEB2-r4",
"35": "mr154-MV411-RNP_ZEB2-r5",
"36": "mr155-MV411-RNP_ZEB2-r6",
"37": "mr156-MV411-RNP_MEF2C-r4",
"38": "mr157-MV411-RNP_MEF2C-r5",
"39": "mr158-MV411-RNP_MEF2C-r6",
"40": "mr159-MV411-RNP_MEIS1-r4",
"41": "mr160-MV411-RNP_MEIS1-r5",
"42": "mr161-MV411-RNP_MEIS1-r6",
"43": "mr162-MV411-RNP_FLI1-r4",
"44": "mr163-MV411-RNP_FLI1-r5",
"45": "mr164-MV411-RNP_FLI1-r6",
"46": "mr165-MV411-RNP_ELF2-r4",
"47": "mr166-MV411-RNP_ELF2-r5",
"48": "mr167-MV411-RNP_ELF2-r6",
"49": "mr168-MV411-RNP_GFI1-r4",
"50": "mr169-MV411-RNP_GFI1-r5",
"51": "mr170-MV411-RNP_GFI1-r6",
"52": "mr171-MV411-RNP_IKZF1-r4",
"53": "mr172-MV411-RNP_IKZF1-r5",
"54": "mr173-MV411-RNP_IKZF1-r6",
"55": "mr174-MV411-RNP_CEBPA-r4",
"56": "mr175-MV411-RNP_CEBPA-r5",
"57": "mr176-MV411-RNP_CEBPA-r6",
"58": "mr177-MV411-RNP_MYB-r4",
"59": "mr178-MV411-RNP_MYB-r5",
"60": "mr179-MV411-RNP_MYB-r6",
"61": "mr180-MV411-RNP_MYBL2-r1",
"62": "mr181-MV411-RNP_MYBL2-r2",
"63": "mr182-MV411-RNP_MYBL2-r3",
"64": "mr183-MV411-RNP_HOXA9-r4",
"65": "mr184-MV411-RNP_HOXA9-r5",
"66": "mr185-MV411-RNP_HOXA9-r6",
"67": "mr186-MV411-RNP_AAVS1-r1",
"68": "mr187-MV411-RNP_AAVS1-r2",
"69": "mr188-MV411-RNP_AAVS1-r3",
"70": "mr189-MV411-RNP_SP1-r4",
"71": "mr190-MV411-RNP_SP1-r5",
"72": "mr191-MV411-RNP_SP1-r6",
"73": "mr192-MV411-RNP_SP1-r7"}
In [60]:
data.columns = [rename[i.split('_')[1]] for i in data.columns]
In [61]:
data
Out[61]:
mr120-MV411-RNP_IRF2BP2-r4 mr129-MV411-RNP_MYC-r4 mr130-MV411-RNP_MYC-r5 mr131-MV411-RNP_MYC-r6 mr132-MV411-RNP_RUNX1-r4 mr133-MV411-RNP_RUNX1-r5 mr134-MV411-RNP_RUNX1-r6 mr135-MV411-RNP_RUNX2-r4 mr136-MV411-RNP_RUNX2-r5 mr137-MV411-RNP_RUNX2-r6 ... mr182-MV411-RNP_MYBL2-r3 mr183-MV411-RNP_HOXA9-r4 mr184-MV411-RNP_HOXA9-r5 mr185-MV411-RNP_HOXA9-r6 mr186-MV411-RNP_AAVS1-r1 mr187-MV411-RNP_AAVS1-r2 mr188-MV411-RNP_AAVS1-r3 mr126-MV411-RNP_MEF2D-r4 mr127-MV411-RNP_MEF2D-r5 mr128-MV411-RNP_MEF2D-r6
gene_id
TSPAN6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.0 0.0 0.0 0.00 0.00 0.00 0.00 0.00 0.00
DPM1 1619.00 2465.00 1701.00 1535.00 1863.00 2093.00 2027.00 2202.00 2148.00 2235.00 ... 3272.00 3686.0 3990.0 4714.0 1620.00 1840.00 1729.00 1983.00 2451.00 2378.00
SCYL3 464.57 846.12 672.69 603.75 577.41 617.97 601.43 545.49 575.14 536.97 ... 961.52 1024.2 1155.4 1316.6 430.78 460.04 437.36 542.42 670.02 576.38
C1orf112 780.43 1031.90 755.31 676.25 1232.70 1209.00 1309.60 1370.50 1245.90 1257.10 ... 1647.50 2260.8 2422.6 2757.4 949.22 1277.00 1032.60 1163.60 1481.00 1332.90
FGR 1443.00 8556.00 6387.00 5955.00 2359.00 2615.00 2258.00 3340.00 3229.00 3466.00 ... 4120.00 4514.0 4748.0 5478.0 2323.00 2401.00 2230.00 3680.00 4706.00 4308.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 3.00 5.00 8.00 2.00 2.00 1.00 2.00 1.00 3.00 3.00 ... 7.00 3.0 3.0 4.0 1.00 1.00 5.00 1.00 2.00 4.00
ERCC-00165 215.00 594.00 424.00 509.00 136.00 88.00 165.00 258.00 161.00 163.00 ... 693.00 318.0 221.0 314.0 93.00 139.00 87.00 127.00 187.00 176.00
ERCC-00168 3.00 12.00 9.00 8.00 0.00 8.00 0.00 5.00 5.00 1.00 ... 10.00 6.0 5.0 8.0 3.00 4.00 1.00 3.00 8.00 3.00
ERCC-00170 66.00 205.00 133.00 211.00 57.00 40.00 73.00 94.00 42.00 40.00 ... 158.00 102.0 117.0 132.0 41.00 56.00 33.00 50.00 89.00 88.00
ERCC-00171 13554.00 40900.00 29090.00 33242.00 10039.00 6399.00 10836.00 15684.00 9526.00 8893.00 ... 51554.00 18598.0 15868.0 22396.0 7058.00 7576.00 5882.00 8381.00 10492.00 12389.00

38779 rows × 69 columns

pre processing

filter some more

In [48]:
toremove = np.argwhere(data.values.var(1)==0)
toremove.ravel()
Out[48]:
array([    1,    15,    24, ..., 58714, 58715, 58718])
In [49]:
toremove.shape
Out[49]:
(19999, 1)
In [50]:
data = data.drop(data.iloc[toremove.ravel()].index,0)
In [51]:
data.shape
Out[51]:
(38779, 69)

renormalize the data

Getting the Core TF information

In [53]:
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
Out[53]:
['MYC',
 'MYB',
 'SPI1',
 'RUNX1',
 'GSE1',
 'IRF2BP2',
 'FLI1',
 'ELF2',
 'ZEB2',
 'IKAROS',
 'GFI1',
 'LMO2',
 'CEBPA',
 'MEF2D',
 'MEF2C',
 'IRF8',
 'MEIS1',
 'RUNX2',
 'ETV6',
 'LDB1',
 'RUNX2',
 'SP1',
 'ZMYND8']
In [54]:
genenames = data.index
ctfpos = [i for i, val in enumerate(genenames) if val in ctf]
notctfpos = [i for i, val in enumerate(genenames) if val not in ctf]

We find a CTF not in the dataset

In [55]:
[val for val in ctf if val not in genenames]
Out[55]:
['IKAROS']
In [56]:
ctf.remove('IKAROS')
In [67]:
data = data.reset_index(drop=True)

DESEQ ANALYSIS

In [72]:
experiments = list(set([i.split('-')[2] for i in data.columns[:-1]]))
In [74]:
experiments.remove("RNP_AAVS1")
In [86]:
for val in experiments:  
    design = pd.DataFrame(index=data.columns[:-1], columns=['DMSO','Target'], 
                          data=np.array([[1 if 'RNP_AAVS1' in i else 0 for i in data.columns[:-1]],[1 if val in i else 0 for i in data.columns[:-1]]]).T)
    design.index = design.index.astype(str).str.replace('-','.')
    deseq = pyDESeq2.pyDESeq2(count_matrix=data, design_matrix = design, 
                              design_formula='~DMSO + Target', gene_column="gene_id")
    deseq.run_deseq()
    deseq.get_deseq_result()
    r = deseq.deseq_result
    r.pvalue = np.nan_to_num(np.array(r.pvalue), 1)
    r.log2FoldChange = np.nan_to_num(np.array(r.log2FoldChange), 0)
    results[val] = r
3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 207 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 194 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 206 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 207 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 199 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 203 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 208 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 203 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 206 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 201 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 201 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 253 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 204 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 201 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 203 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 203 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 208 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 154 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

3.2.6
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 202 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

what are most downregulated (volcano)

In [88]:
results.pvalue = np.nan_to_num(np.array(results.pvalue), 1)
results.log2FoldChange = np.nan_to_num(np.array(results.log2FoldChange), 0)
results.gene_id = convertGenes(results.gene_id)[0]
show(volcano(results,tohighlight=ctf))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-88-6faadb1a20a7> in <module>
----> 1 results.pvalue = np.nan_to_num(np.array(results.pvalue), 1)
      2 results.log2FoldChange = np.nan_to_num(np.array(results.log2FoldChange), 0)
      3 results.gene_id = convertGenes(results.gene_id)[0]
      4 show(volcano(results,tohighlight=ctf))

~/.local/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'pvalue'

CTF (volcano)

In [ ]:
results
In [95]:
for val in experiments:
    a = h.volcano(results[val],tohighlight=ctf,title=val, maxvalue= 60, searchbox=True, minlogfold=0.5)
    try:
        show(a)
    except RuntimeError:
        show(a)

any bias in the data

In [126]:
datad = data
In [127]:
data = data.drop(columns='mr129-MV411-RNP_MYC-r4')
In [136]:
col = {v:i for i, v in enumerate(set([i.split('-')[2] for i in data.columns[:-1]]))}
In [137]:
red = PCA(2).fit_transform(data[data.columns[:-1]].T)
h.scatter(red, labels=data.columns[:-1], radi=60000, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
Out[137]:
Figure(
id = '32097', …)
In [146]:
red = PCA(30).fit_transform(data[data.columns[:-1]].T)
red = TSNE(2,4).fit_transform(red)
In [141]:
red.shape
Out[141]:
(69, 2)

mr129-MYC-r4 seems weird

In [147]:
h.scatter(red, labels=data.columns[:-1], radi=10, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
Out[147]:
Figure(
id = '36372', …)
In [ ]:
pca = PCA(20)
red = pca.fit_transform(data.T)
In [ ]:
pca.explained_variance_ratio_

GSEA analysis

In [167]:
res = {}
In [133]:
data = datad
In [150]:
totest
Out[150]:
mr123-MV411-RNP_IRF8-r4 mr124-MV411-RNP_IRF8-r5 mr125-MV411-RNP_IRF8-r6 mr186-MV411-RNP_AAVS1-r1 mr187-MV411-RNP_AAVS1-r2 mr188-MV411-RNP_AAVS1-r3
0 0.00 0.00 0.00 0.00 0.00 0.00
1 2211.00 2243.00 2269.00 1620.00 1840.00 1729.00
2 611.42 621.91 622.93 430.78 460.04 437.36
3 1390.60 1268.10 1244.10 949.22 1277.00 1032.60
4 3652.00 3917.00 4442.00 2323.00 2401.00 2230.00
... ... ... ... ... ... ...
38774 2.00 4.00 0.00 1.00 1.00 5.00
38775 165.00 119.00 130.00 93.00 139.00 87.00
38776 2.00 4.00 0.00 3.00 4.00 1.00
38777 51.00 52.00 31.00 41.00 56.00 33.00
38778 8976.00 7816.00 9319.00 7058.00 7576.00 5882.00

38779 rows × 6 columns

In [152]:
data = data.set_index('gene_id',drop=True)
In [168]:
res[val]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-168-edbe59d9522e> in <module>
----> 1 res[val]

KeyError: 'RNP_IRF8'
In [169]:
for val in experiments:
    print(val)
    totest = data[[v for v in data.columns[:-1] if val in v or 'AAVS1' in v]]
    cls = ['Condition' if val in v else 'DMSO' for v in totest.columns]
    res[val] = gseapy.gsea(data=totest, gene_sets='WikiPathways_2013', 
                cls= cls, no_plot=False, processes=10)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
RNP_IRF8
RNP_SPI1
RNP_FLI1
RNP_CEBPA
RNP_MYC
RNP_MYB
RNP_ELF2
RNP_GFI1
RNP_RUNX1
RNP_IRF2BP2
RNP_MEF2D
RNP_IKZF1
RNP_MEF2C
RNP_LMO2
RNP_MYBL2
RNP_MAX
RNP_ZMYND8
RNP_LYL1
RNP_HOXA9
RNP_RUNX2
RNP_ZEB2
RNP_MEIS1
In [170]:
with open('../data/wikipathway_RNPv2', 'wb') as f:
    pickle.dump(res,f)
In [158]:
with open('../data/wikipathway_RNPv2','rb') as f:
    res = pickle.load(f)
In [172]:
for i, val in enumerate(experiments):
    plt.figure(i)
    res[val].res2d.Term = [i[2:-13] for i in res[val].res2d.index]
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
/home/jeremie/.local/lib/python3.7/site-packages/ipykernel_launcher.py:2: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  
In [174]:
a = set()
for k, val in res.items():
    a.update(set(val.res2d.index))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[i][n] = v.es
res = pd.DataFrame(a, index=res.keys())
fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(ax=ax,data=res)
Out[174]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb2cdbb0590>
In [175]:
model = AgglomerativeClustering(n_clusters=6,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(res)
ii = itertools.count(res.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = plotCorrelationMatrix(res.values[sort],res.index[sort].tolist(),interactive=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-175-18b55e342942> in <module>
      5 tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
      6 sort = labels.argsort()
----> 7 a = plotCorrelationMatrix(res.values[sort],res.index[sort].tolist(),interactive=True)

NameError: name 'plotCorrelationMatrix' is not defined
In [176]:
fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(ax=ax,data=res)
Out[176]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb2cc92bc10>
In [ ]:
fig.savefig("enriched_terms.png")
In [ ]:
show(a)
In [ ]:
fi
In [ ]:
experiments
In [ ]:
data
In [179]:
res = {}
In [181]:
for i, val in enumerate(experiments):
    print(val)
    totest = data[[v for v in data.columns[:-1] if val in v or 'AAVS1' in v]]
    cls = ['Condition' if val in v else 'DMSO' for v in totest.columns]
    res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015', 
                cls= cls, no_plot=False, processes=14)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    plt.figure(i)
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
RNP_IRF8
RNP_SPI1
RNP_FLI1
RNP_CEBPA
RNP_MYC
RNP_MYB
RNP_ELF2
RNP_GFI1
RNP_RUNX1
RNP_IRF2BP2
RNP_MEF2D
RNP_IKZF1
RNP_MEF2C
RNP_LMO2
RNP_MYBL2
RNP_MAX
RNP_ZMYND8
RNP_LYL1
RNP_HOXA9
RNP_RUNX2
RNP_ZEB2
/home/jeremie/.local/lib/python3.7/site-packages/ipykernel_launcher.py:8: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  
RNP_MEIS1
In [182]:
with open('../data/GO_Biological_Process_2015_RNPv2', 'wb') as f:
    pickle.dump(res,f)
In [ ]:
with open('GO_Biological_Process_2015','rb') as f:
    res = pickle.load(f)

creating matrices

In [183]:
a = set()
for k, val in res.items():
    a.update(set(val.res2d.Term))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[v.Term][n] = v.es
res = pd.DataFrame(a, index=res.keys())
fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(ax=ax,data=res)
Out[183]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb2674cd8d0>
In [205]:
model = AgglomerativeClustering(n_clusters=8,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(res)
ii = itertools.count(res.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
In [206]:
sort = labels.argsort()
In [207]:
a = h.plotCorrelationMatrix(res.values[sort],res.index[sort].tolist(),interactive=True,title="RNP2_bioproc_corr")
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('alphas', 484), ('colors', 484), ('data', 22), ('xname', 484), ('yname', 484)
/home/jeremie/.local/lib/python3.7/site-packages/bokeh/io/saving.py:126: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.7/site-packages/bokeh/io/saving.py:139: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-207-d342427fa33f> in <module>
----> 1 a = h.plotCorrelationMatrix(res.values[sort],res.index[sort].tolist(),interactive=True,title="RNP2_bioproc_corr")

~/JKBio/Helper.py in plotCorrelationMatrix(data, names, colors, title, dataIsCorr, invert, size, interactive, rangeto)
    390     except:
    391       show(p)
--> 392     save(p, title + '.html')
    393 
    394     return p  # show the plot

~/.local/lib/python3.7/site-packages/bokeh/io/saving.py in save(obj, filename, resources, title, template, state, **kwargs)
     84 
     85     filename, resources, title = _get_save_args(state, filename, resources, title)
---> 86     _save_helper(obj, filename, resources, title, template)
     87     return abspath(filename)
     88 

~/.local/lib/python3.7/site-packages/bokeh/io/saving.py in _save_helper(obj, filename, resources, title, template)
    146     '''
    147     from ..embed import file_html
--> 148     html = file_html(obj, resources, title=title, template=template)
    149 
    150     with io.open(filename, mode="w", encoding="utf-8") as f:

~/.local/lib/python3.7/site-packages/bokeh/embed/standalone.py in file_html(models, resources, title, template, template_variables, theme, suppress_callback_warning, _always_new)
    288         models = models.roots
    289 
--> 290     with OutputDocumentFor(models, apply_theme=theme, always_new=_always_new) as doc:
    291         (docs_json, render_items) = standalone_docs_json_and_render_items(models, suppress_callback_warning=suppress_callback_warning)
    292         title = _title_from_models(models, title)

/usr/lib/python3.7/contextlib.py in __enter__(self)
    110         del self.args, self.kwds, self.func
    111         try:
--> 112             return next(self.gen)
    113         except StopIteration:
    114             raise RuntimeError("generator didn't yield") from None

~/.local/lib/python3.7/site-packages/bokeh/embed/util.py in OutputDocumentFor(objs, apply_theme, always_new)
    136             doc = Document()
    137             for model in objs:
--> 138                 doc.add_root(model)
    139 
    140         # handle a single shared document

~/.local/lib/python3.7/site-packages/bokeh/document/document.py in add_root(self, model, setter)
    302             self._roots.append(model)
    303         finally:
--> 304             self._pop_all_models_freeze()
    305         self._trigger_on_change(RootAddedEvent(self, model, setter))
    306 

~/.local/lib/python3.7/site-packages/bokeh/document/document.py in _pop_all_models_freeze(self)
   1017         self._all_models_freeze_count -= 1
   1018         if self._all_models_freeze_count == 0:
-> 1019             self._recompute_all_models()
   1020 
   1021     def _recompute_all_models(self):

~/.local/lib/python3.7/site-packages/bokeh/document/document.py in _recompute_all_models(self)
   1040             d._detach_document()
   1041         for a in to_attach:
-> 1042             a._attach_document(self)
   1043         self._all_models = recomputed
   1044         self._all_models_by_name = recomputed_by_name

~/.local/lib/python3.7/site-packages/bokeh/model.py in _attach_document(self, doc)
    725         '''
    726         if self._document is not None and self._document is not doc:
--> 727             raise RuntimeError("Models must be owned by only a single document, %r is already in a doc" % (self))
    728         doc.theme.apply_to_model(self)
    729         self._document = doc

RuntimeError: Models must be owned by only a single document, Rect(id='43474', ...) is already in a doc
In [ ]:
cluster1= ['LMO2','LYL1','MAX','MEF2C']
cluster2=['GFI1','FLI1','MYB','IKZF1','ELF2','CEBPa','MEIS1']
cluster3=['IRF2BP2','MEF2C','CDK6','MEF2D','IRF8','BRD4','MYC']
cluster4= ['RUNX1','RUNX2','ZMYND8']
In [ ]:
res.loc[cluster2].mean().sort_values()
In [ ]:
'GO_Molecular_Function_2015',
'GeneSigDB',
'ENCODE_TF_ChIP-seq_2014',
#'Drug_Perturbations_from_GEO_2014',
'GO_Cellular_Component_2015',
'GO_Biological_Process_2015',
'PPI_Hub_Proteins',
'WikiPathways_2013',
'TF-LOF_Expression_from_GEO',
# msig db C2 C6 H http://software.broadinstitute.org/gsea/msigdb/annotate.jsp
# max's crc

compare to the drop of CTF

In [ ]:
ctf = [
 'BRD4',
 'CDK6',
 'CEBPA',
 'ELF2',
 'FLI1',
 'GFI1',
 'IKZF1',
 'IRF2BP2',
 'IRF8',
 'LMO2',
 'LYL1',
 'MAX',
 'MEF2C',
 'MEF2D',
 'MEIS1',
 'MYB',
 'MYC',
 'RUNX1',
 'RUNX2',
 'SPI1',
 'ZEB2',
 'ZMYND8'
]
In [ ]:
deseq = pd.DataFrame()
for k, val in results.items():
    deseq[k] = val.log2FoldChange
deseq=deseq.T
In [ ]:
deseq
In [ ]:
a = plotCorrelationMatrix(a, deseq.index[sort].tolist(),interactive=True)
In [ ]:
ctf[11] = 'CEBPa'
ctf[]
In [ ]:
ctf

dropping ETV6 SP1 GSE1 LDB1

In [ ]:
deseq.loc[['MYC',
 'MYB',
 'SPI1',
 'RUNX1',
 'IRF2BP2',
 'FLI1',
 'ELF2',
 'ZEB2',
 'GFI1',
 'LMO2',
 'CEBPa',
 'MEF2D',
 'MEF2C',
 'IRF8',
 'MEIS1',
 'RUNX2',
 'RUNX2',
 'ZMYND8']]
In [ ]:
show(a)
In [ ]:
deseq_ctf = deseq.loc[['MYC',
 'MYB',
 'SPI1',
 'RUNX1',
 'IRF2BP2',
 'FLI1',
 'ELF2',
 'ZEB2',
 'GFI1',
 'LMO2',
 'CEBPa',
 'MEF2D',
 'MEF2C',
 'IRF8',
 'MEIS1',
 'RUNX2',
 'ZMYND8']]
model = AgglomerativeClustering(n_clusters=7,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(deseq_ctf)
ii = itertools.count(deseq_ctf.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = deseq_ctf.values[sort]
a = plotCorrelationMatrix(a, deseq_ctf.index[sort].tolist(),interactive=True)
In [ ]:
show(a)
In [ ]:
model = AgglomerativeClustering(n_clusters=7,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(deseq)
ii = itertools.count(deseq.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = deseq.values[sort]
a = plotCorrelationMatrix(a, deseq.index[sort].tolist(),interactive=True)
In [ ]:
show(a)

tsne, pca, clustering accross TF, CRC, most var genes, both ways.